import numpy as np
import pandas as pd
import plotly
import cufflinks as cf
import seaborn as sns
# SECURITY(review): a username/API-key pair is hard-coded on the next line.
# Treat the key as leaked: rotate it and load it from the environment or an
# untracked config file instead of committing it.  The figures visible in this
# file are all rendered through plotly.offline, so presumably these online
# credentials are not needed at all — TODO confirm before removing the call.
plotly.tools.set_credentials_file(username='DesciuitV',api_key='Q1xpEcxmp80nvR9Cho16')
import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode,iplot
import plotly.io as pio
from ipywidgets import interactive, HBox, VBox, widgets, interact
init_notebook_mode(connected=True)
#cf.set_config_file(theme='ggplot')
from IPython.display import HTML
# Notebook UI helper: builds an HTML snippet containing a script and a button
# that hide/show the `div.input` elements (the notebook's code cells).  The
# script calls `$`, so it relies on jQuery being present on the page, and the
# cells start hidden because `code_toggle` is run once on document ready.
# NOTE(review): the value of this expression is not assigned or displayed
# explicitly, so it only renders when it is the last statement of a notebook
# cell; in a plain script it has no visible effect.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Load the raw business and review tables.  The ID columns are read as `str`
# so pandas does not try to infer a numeric dtype for them.
df1 = pd.read_csv("business.csv", dtype={'business_id': str})
df2 = pd.read_csv("review.csv", dtype={'business_id': str, 'review_id': str, 'user_id': str})
df1.dropna(subset=['categories'], inplace=True)

# Terms that mark a business as food related.  ("Sandwiches" was previously
# misspelled and therefore never matched.)
cat = ("Restaurants|Ice Cream & Frozen Yogurt|Food|Coffee & Tea|Desserts|Pizza|Bars|Fast Food|FastFood"
       "|Sandwiches|Bakeries|Delis|Breakfast|Pub|Bars|Cafe")

# Terms that disqualify a business from the food bucket even if it matches
# `cat`.  The groups below used to be concatenated without a "|" between
# them (e.g. "Self StorageBanks", "ChurchesProfessional ServicesShoppingActive
# Life"), which silently disabled those alternatives.
nocat = ("Grocery|Spa|Nail Salons|Beauty & Spas|Barbers|Hair Salons|Tattoo|Hair|Nails"
         "|Auto|Automotive|Auto Repair|Car Rental|Car Dealers"
         "|Real Estate|Apartments|Home Services|Pest Control|Self Storage"
         "|Banks|Credit Unions|Financial Services"
         "|Veterinarians|Pets|Pet Stores"
         "|Health & Medical|Doctors|Health|Dentists"
         "|Local Services|Laundry|Libraries|Repair|Cinema|Arts|Public Services|Government"
         "|Professional|Religious Organizations|Churches"
         "|Professional Services|Shopping"
         "|Active Life|Yoga|Golf|Swimming Pools|Hiking|Amusement Parks|Gyms"
         "|Education|Mass Media|Radio Stations|Local Flavor"
         "|Guns|Tax|Magic|Boat|Tours|Watches|Bike")

# Every rule is evaluated against the ORIGINAL category string and the first
# matching rule wins, so a label assigned by one rule can never be re-matched
# by a later one.
raw_categories = df1['categories'].copy()
is_food = raw_categories.str.contains(cat) & ~raw_categories.str.contains(nocat)
is_supermarket = raw_categories.str.contains("Grocery") & raw_categories.str.contains("Food")

# (label, regex) pairs in priority order.  'Professional Services' sits before
# 'Local/Government Services' because the latter's pattern contains the bare
# term "Professional", which previously swallowed every such row and left the
# 'Professional Services' rule unreachable.
pattern_rules = [
    ('Beauty/Spa Services', "Nail Salons|Beauty & Spas|Barbers|Hair Salons|Tattoo|Hair|Nails"),
    ('Car Services', "Auto|Automotive|Auto Repair|Car Rental|Car Dealers"),
    ('Real Estate & Home Services', "Real Estate|Apartments|Home Services|Pest Control|Self Storage"),
    ('Financial Services', "Banks|Credit Unions|Financial Services"),
    ('Pet Stores/Services', "Veterinarians|Pets|Pet Stores"),
    ('Hotels & Event Planning', "Hotels|Event|Travel"),
    ('Health & Medical Services', "Health & Medical|Doctors|Health|Dentists"),
    ('Professional Services', "Professional Services"),
    ('Local/Government Services',
     "Local Services|Laundry|Libraries|Repair|Cinema|Arts|Public Services|Government|Professional"
     "|Religious Organizations|Churches"),
    ('Shopping', "Shopping"),
    ('Active Life', "Active Life|Yoga|Golf|Swimming Pools|Hiking|Amusement Parks|Gyms"),
    ('Education', "Education"),
    ('Night Life', "Night Life|Karaoke|NightLife|Nightlife"),
    ('Mass Media/Radio Stations', "Mass Media|Radio Stations"),
    ('Local Flavor', "Local Flavor"),
]
category_rules = [('Restaurants/Food', is_food), ('Supermarket/ConvenienceStores', is_supermarket)]
category_rules += [(rule_label, raw_categories.str.contains(rule_pattern))
                   for rule_label, rule_pattern in pattern_rules]

unassigned = pd.Series(True, index=df1.index)
for rule_label, rule_mask in category_rules:
    hit = rule_mask & unassigned
    df1.loc[hit, 'categories'] = rule_label
    unassigned &= ~hit
# Ten most frequent category labels, re-sorted ascending so the largest bar is
# drawn at the top of the horizontal chart.
dfa = df1['categories'].value_counts().iloc[:10].sort_values()
label, size = dfa.index, dfa.values

bar_marker = {'color': size, 'colorscale': 'Jet', 'showscale': False, 'reversescale': True}
trace = go.Bar(x=size, y=label, marker=bar_marker, text=size, textposition='auto', orientation='h')
data = [trace]
layout = go.Layout(
    title='Top 10 Categories in Yelp',
    yaxis={'title': 'Categories', 'automargin': True},
    xaxis={'title': "Number of Businesses"},
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)

# Save a static copy next to the notebook, then render the chart inline.
pio.write_image(fig, 'fig1.svg')
iplot(fig)
# Keep only the businesses labelled as restaurants/food, drop the columns that
# are not used afterwards and discard rows without a postal code.
dfr = df1[df1.categories.str.contains('Restaurants/Food')]
dfr = dfr.drop(['Unnamed: 0', 'address', 'attributes', 'hours'], axis=1)
dfr = dfr.dropna(subset=['postal_code'], axis=0)

# City names are upper-cased first, so every pattern below must be written in
# upper case as well.
dfr.city = dfr.city.astype(str).str.upper()

# (canonical name, regex of observed spellings), applied in this order.
# The Montreal pattern previously contained a lower-case "é" alternative, which
# can never match an upper-cased string, and had no alternative for the
# correctly accented "MONTRÉAL"; both accented forms are now spelled in upper
# case.  "MONTRÃ" catches the mis-decoded (mojibake) spelling.
city_aliases = [
    ('MONTREAL', 'MONTRÃ|MONTRE|MONTRÉ|MONTÉAL'),
    ('TORONTO', 'TORO|TORN'),
    ('MISSISSAUGA', 'MISSISSAUGA|MISSISAUGA|MISSISSUAGA'),
    ('LAS VEGAS', 'LAS V|LASV|LAS VEGAS'),
    ('YORK', 'YORK'),
    ('CALGARY', 'CALGARY'),
]
for canonical_city, spelling_pattern in city_aliases:
    dfr.loc[dfr['city'].str.contains(spelling_pattern), 'city'] = canonical_city
# Function to format numbers with a comma thousands separator
def thousands_format(x):
    """Return the values of *x* as strings with comma thousands separators.

    Each value is converted with ``int()`` before formatting, so NumPy
    integers and whole-valued floats (e.g. counts that pandas upcast to
    float64) are accepted; the ``d`` format code on its own rejects floats.

    Parameters:
        x: iterable of numbers (list, NumPy array, pandas Series, ...).

    Returns:
        list of str, one formatted entry per input value, in input order.

    Raises:
        ValueError: if a value cannot be converted to an integer (e.g. NaN).
    """
    return ['{:,d}'.format(int(value)) for value in x]
# Total review count per city: take the 15 largest totals, then re-sort them
# ascending so the biggest city is drawn at the top of the horizontal chart.
reviews_by_city = dfr.groupby(['city'])['review_count'].sum()
top_cities = reviews_by_city.sort_values(ascending=False).iloc[:15]
dfr1 = top_cities.sort_values()
label, size = dfr1.index, dfr1.values

bar_marker = {'color': size, 'colorscale': 'Jet', 'showscale': False, 'reversescale': True}
trace = go.Bar(
    x=size,
    y=label,
    marker=bar_marker,
    text=thousands_format(size),
    textposition='auto',
    orientation='h',
)
data = [trace]
layout = go.Layout(
    title='Total Number of Reviews In Each City',
    yaxis={'title': 'City', 'automargin': True},
    xaxis={'title': "Number of Reviews", 'automargin': True},
    showlegend=False,
    width=1050,
    height=700,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Per-city subsets of the restaurant table.
t = dfr[dfr['city'] == 'TORONTO']
mt = dfr[dfr['city'] == 'MONTREAL']
lv = dfr[dfr['city'] == 'LAS VEGAS']
p = dfr[dfr['city'] == 'PHOENIX']

# Mapbox configuration shared by every map below.
# NOTE(review): the access token is committed in source; consider loading it
# from the environment instead.
mapbox_access_token = 'pk.eyJ1IjoiZGVzY2l1aXR2IiwiYSI6ImNqcnZ1OWZ6MjA1eDYzeXBpOG5sZWd1NGcifQ.GQV-bvZFT62xzUBu2d4s6g'
map_style = "mapbox://styles/desciuitv/cjrwd4gn215au1ftfaphiuhhl"


def _hover_text(businesses, include_stars):
    """Build one hover label per row of *businesses* (name, optional rating, review count)."""
    names = businesses['name']
    review_counts = businesses['review_count']
    if include_stars:
        return ['Name:' + name + '<br>Stars:' + str(rating) + '<br>Number of Reviews:' + str(revs)
                for name, rating, revs in zip(names, businesses['stars'], review_counts)]
    return ['Name:' + name + '<br>Number of Reviews:' + str(revs)
            for name, revs in zip(names, review_counts)]


def _mapbox_settings(lat, lon, zoom, bearing, pitch):
    """Return the `layout.mapbox` settings for a camera centred on (lat, lon)."""
    return dict(
        accesstoken=mapbox_access_token,
        bearing=bearing,
        center=dict(lat=lat, lon=lon),
        pitch=pitch,
        zoom=zoom,
        style=map_style,
    )


def _marker_trace(businesses):
    """Return a scattermapbox trace dict with one gold marker per row of *businesses*."""
    return {
        'type': 'scattermapbox',
        'lat': list(businesses['latitude']),
        'lon': list(businesses['longitude']),
        'mode': 'markers',
        'marker': {'size': 6, 'color': 'gold', 'opacity': 0.5},
        # The rating is left out of the label: the slider already shows it.
        'text': _hover_text(businesses, include_stars=False),
    }


def _plot_star_distribution(businesses, city_name):
    """Render a bar chart of the number of businesses per star rating."""
    counts = businesses['stars'].value_counts()
    trace = go.Bar(
        x=counts.index,
        y=counts.values,
        marker=dict(color=counts.values, colorscale='Viridis', showscale=False, reversescale=True,
                    line=dict(color='rgb(8,48,107)', width=2)),
        text=counts.values,
        textposition='auto',
        orientation='v',
        opacity=0.7,
    )
    layout = go.Layout(
        title='Star Rating Distribution in ' + city_name,
        font=dict(size=16, color='black'),
        yaxis=dict(title='# of Businesses', automargin=True),
        xaxis=dict(title="Star Ratings", automargin=True),
        showlegend=False,
        width=1050,
        height=700,
    )
    plotly.offline.iplot(go.Figure(data=[trace], layout=layout))


def _plot_business_map(businesses, lat, lon, zoom, bearing=0, pitch=60):
    """Render every row of *businesses* as a marker on a single static map."""
    data = [
        go.Scattermapbox(
            lat=businesses['latitude'],
            lon=businesses['longitude'],
            mode='markers',
            marker=dict(size=6, color='gold', opacity=.5),
            text=_hover_text(businesses, include_stars=True),
        )
    ]
    layout = go.Layout(
        autosize=True,
        hovermode='closest',
        width=1050,
        height=750,
        mapbox=_mapbox_settings(lat, lon, zoom, bearing, pitch),
    )
    plotly.offline.iplot(dict(data=data, layout=layout))


def _plot_star_animation(businesses, lat, lon, zoom, slider_transition_ms=300):
    """Render a map with one animation frame (and slider step) per star rating.

    The figure initially shows the lowest rating present in *businesses*;
    the Play/Pause buttons and the slider move through the other ratings.
    """
    stars = sorted(businesses['stars'].unique())

    # One frame per rating; each frame holds exactly one trace built from the
    # rows of THIS city with that rating (the Montreal copy of this code used
    # to take its hover text from the Toronto subset).
    frames = [{'data': [_marker_trace(businesses[businesses['stars'] == star])], 'name': str(star)}
              for star in stars]

    # Frame names are strings, so the slider steps refer to them as strings too.
    slider_steps = [{'args': [[str(star)],
                              {'frame': {'duration': 1000, 'redraw': False},
                               'mode': 'immediate',
                               'transition': {'duration': 300}}],
                     'label': str(star), 'value': str(star),
                     'method': 'animate'}
                    for star in stars]
    sliders_dict = {
        'active': 0,  # index of the step that starts out selected
        'steps': slider_steps,
        'yanchor': 'top',
        'xanchor': 'left',
        'currentvalue': {'font': {'size': 20}, 'prefix': 'Stars:', 'visible': True, 'xanchor': 'right'},
        'transition': {'duration': slider_transition_ms, 'easing': 'cubic-in-out'},
        'pad': {'b': 10, 't': 50},
        'len': 0.9,
        'x': 0.1,
        'y': 0,
    }

    play_button = {
        'args': [None, {'frame': {'duration': 1000, 'redraw': False},
                        'fromcurrent': True,
                        'transition': {'duration': 300, 'easing': 'cubic-in-out'},
                        'mode': 'immediate'}],
        'label': 'Play',
        'method': 'animate',
    }
    pause_button = {
        'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                          'transition': {'duration': 0}}],
        'label': 'Pause',
        'method': 'animate',
    }

    figure = {
        # Initial view: the first (lowest) rating only; empty if there are no rows.
        'data': [_marker_trace(businesses[businesses['stars'] == star]) for star in stars[:1]],
        'layout': {
            'autosize': True,
            'hovermode': 'closest',
            'width': 1050,
            'height': 750,
            'mapbox': _mapbox_settings(lat, lon, zoom, bearing=0, pitch=60),
            'updatemenus': [{
                'buttons': [play_button, pause_button],
                'direction': 'left',
                'pad': {'r': 10, 't': 87},
                'showactive': False,
                'type': 'buttons',
                'x': 0.1,
                'xanchor': 'right',
                'y': 0,
                'yanchor': 'top',
            }],
            'sliders': [sliders_dict],
        },
        'frames': frames,
    }
    plotly.offline.iplot(figure)


# Toronto
_plot_star_distribution(t, 'Toronto')
_plot_business_map(t, lat=43.6512775, lon=-79.38878, zoom=14, bearing=70, pitch=70)
_plot_star_animation(t, lat=43.671094, lon=-79.3874455, zoom=12)

# Montreal
_plot_star_distribution(mt, 'Montreal')
_plot_business_map(mt, lat=45.489505, lon=-73.599207, zoom=12)
_plot_star_animation(mt, lat=45.489505, lon=-73.599207, zoom=12)

# Las Vegas (its slider used a 200 ms transition in the original; kept as is)
_plot_star_distribution(lv, 'Las Vegas')
_plot_business_map(lv, lat=36.110926, lon=-115.173089, zoom=14)
_plot_star_animation(lv, lat=36.110926, lon=-115.173089, zoom=11, slider_transition_ms=200)
# Remove the 'Unnamed: 0' column from the review table in place.
df2.drop('Unnamed: 0', axis=1, inplace=True)
import re
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from tqdm import tqdm, tqdm_pandas
import warnings
warnings.filterwarnings("ignore")
# Attach name/city/state to every review.  The right join keeps every row of
# the business table, including businesses that have no matching review.
business_info = df1[['business_id', 'name', 'city', 'state']]
dfn = df2.merge(business_info, on='business_id', how='right')
def percentile(n):
    """Return a one-argument function computing the *n*-th percentile.

    The returned function is named ``percentile_<n>``.  NOTE(review):
    presumably so that it gets a readable column label when passed to a
    pandas ``agg`` call — no such call is visible in this file.

    Parameters:
        n: percentile (or sequence of percentiles) passed to ``np.percentile``.

    Returns:
        callable taking an array-like ``x`` and returning ``np.percentile(x, n)``.
    """
    def percentile_(x):
        return np.percentile(x, n)

    # str.format instead of "%": the % operator would try to unpack a tuple
    # `n` as several format arguments and raise.
    percentile_.__name__ = 'percentile_{}'.format(n)
    return percentile_
# Drop every row with a missing value; this also removes businesses that the
# right join kept without a matching review, so each remaining row is a review.
dfn.dropna(inplace=True)

# Number of reviews per star rating.
dp = dfn['stars'].value_counts()
label = dp.index
size = dp.values
trace = go.Bar(
    x=label,
    y=size,
    marker=dict(color=size, colorscale='Portland', showscale=False, reversescale=True,
                line=dict(color='rgb(8,48,107)', width=2)),
    text=thousands_format(size),
    textposition='auto',
    orientation='v',
    opacity=0.7,
)
data = [trace]
layout = go.Layout(
    title='Star Rating Distribution for all Reviews',
    font=dict(size=16, color='black'),
    # The bars count reviews, not businesses (the label was copied from the
    # per-city business charts).
    yaxis=dict(title='# of Reviews', automargin=True),
    xaxis=dict(title="Star Ratings", automargin=True),
    showlegend=False,
    width=1050,
    height=700,
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Length of every review text, in characters.
dfn['text_length'] = dfn['text'].map(len)

# Box plot of review length for each star rating.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(13, 13))
sns.boxplot(data=dfn, x='stars', y='text_length', palette='Accent')
plt.ylabel("Length of text", fontsize=17)
plt.xlabel("Stars", fontsize=17)
plt.title("Distribution of text length for star ratings")
plt.show()

# Summary statistics of review length per star rating; the first element of
# each pair becomes the column name of the result.
length_stats = [
    ('Mean', np.mean),
    ('Median', np.median),
    ('25th Percentile', lambda lengths: np.percentile(lengths, q=25)),
]
dfnn = dfn.groupby('stars')['text_length'].agg(length_stats).reset_index()
print(dfnn)
# Load the term-frequency table: the file's first line is skipped, the first
# column becomes the index and the two remaining columns are named here.
# Note that this rebinds `df2`, which held the review table until now.
term_counts = pd.read_csv('term_freq.csv', header=None, skiprows=1, index_col=0)
term_counts.columns = ['negative', 'positive']
term_counts['total'] = term_counts['negative'].add(term_counts['positive'])
df2 = term_counts
# The original notebook cells started with the IPython cell magic `%%time`.
# A cell magic is not valid syntax in a plain Python script, so the magics are
# recorded here as comments only.

# NOTE(review): `df` (with a `reviews` column) is not defined anywhere in the
# visible code — confirm where it comes from before running this section.

# %%time
# Learn a vocabulary of at most 10,000 tokens, ignoring English stop words.
vect1 = CountVectorizer(stop_words='english', max_features=10000)
vect1.fit(df.reviews)

# %%time
# Sparse document-term count matrix: one row per review, one column per token.
document_matrix = vect1.transform(df.reviews)
(document_matrix).shape

# Per-token totals are accumulated in row batches so that only one batch at a
# time has to be converted to a dense array.
# NOTE(review): the row boundaries are hard-coded and presumably assume that
# `df` lists the negative reviews first — verify.  As written, row 1542841
# falls in neither range: the negative slices stop before it and the positive
# slices start at 1542842.

# %%time
neg_batches = np.linspace(0, 1542841, 10).astype(int)
neg_tf = []
for batch_start, batch_stop in zip(neg_batches[:-1], neg_batches[1:]):
    neg_tf.append(np.sum(document_matrix[batch_start:batch_stop].toarray(), axis=0))
    print(batch_stop, "entries' term frequency calculated")

# %%time
pos_batches = np.linspace(1542842, 3085662, 10).astype(int)
pos_tf = []
for batch_start, batch_stop in zip(pos_batches[:-1], pos_batches[1:]):
    pos_tf.append(np.sum(document_matrix[batch_start:batch_stop].toarray(), axis=0))
    print(batch_stop, "entries' term frequency calculated")

# Collapse the per-batch totals into one count per token and class.
neg = np.sum(neg_tf, axis=0)
pos = np.sum(pos_tf, axis=0)

# Newer scikit-learn releases replaced `get_feature_names` with
# `get_feature_names_out`; use whichever the installed version provides.
if hasattr(vect1, 'get_feature_names_out'):
    feature_names = vect1.get_feature_names_out()
else:
    feature_names = vect1.get_feature_names()

# One row per token with its negative, positive and combined counts.
tf = pd.DataFrame([neg, pos], columns=feature_names).transpose()
tf.columns = ['negative', 'positive']
tf['total'] = tf['negative'] + tf['positive']
# Top 50 tokens by count in negative reviews, re-sorted ascending so the most
# frequent token is drawn at the top of the horizontal bar chart.
plt.figure(figsize=(12, 12))
plt.style.use('fivethirtyeight')
negwords = tf.sort_values(by='negative', ascending=False)['negative'][:50].sort_values()
# Size the bar positions from the data so the chart still works when the
# vocabulary holds fewer than 50 tokens.
y_pos = np.arange(len(negwords))
plt.barh(y_pos, negwords, color='brown', alpha=0.7)
plt.yticks(y_pos, negwords.index)
plt.ylabel('Negative Tokens')
plt.xlabel('Frequency')
plt.title('Top 50 (negative) tokens')
plt.show()

# Same chart for positive reviews.  It reads from `tf`, like the negative
# chart and the scatter plot below; it previously read from `df2`.
# NOTE(review): `df2` is presumably a saved copy of `tf` — confirm.
plt.figure(figsize=(12, 12))
plt.style.use('fivethirtyeight')
poswords = tf.sort_values(by='positive', ascending=False)['positive'][:50].sort_values()
y_pos = np.arange(len(poswords))
plt.barh(y_pos, poswords, color='green', alpha=0.7)
plt.yticks(y_pos, poswords.index)
plt.ylabel('Positive Tokens')
plt.xlabel('Frequency')
plt.title('Top 50 (Positive) tokens')
plt.show()

# Every token plotted by its negative count (x) against its positive count (y).
plt.figure(figsize=(8, 6))
ax = sns.regplot(x="negative", y="positive", fit_reg=False, scatter_kws={'alpha': 0.5}, data=tf)
plt.ylabel('Positive Frequency')
plt.xlabel('Negative Frequency')
plt.title('Negative Frequency vs Positive Frequency')
plt.show()
Scattertext to extract meaningful tokens from the frequency data.
We'll try to find a more meaningful metric that allows us to characterise the important tokens in each class. Intuitively, how often a word occurs in one class relative to the other is a reasonable measure of how important the word is in characterising that class. First, we'll define the positive rate as $$\text{pos_rate} = \frac{\text{frequency of the word in positive reviews}}{\text{total occurrences of the word (in positive and negative reviews)}}$$
# Share of each token's occurrences that come from positive reviews.
tf['pos_rate'] = tf['positive'].div(tf['total'])
# Five tokens with the highest positive rate.
tf.sort_values('pos_rate', ascending=False).iloc[:5]
As we can see, the words with the highest positive rate have a relatively very low frequency in negative reviews. The overall frequency of these words also makes it unlikely that we can use the positive rate as a reliable metric for positive reviews.
Alternatively, another metric is the frequency with which a word occurs in the class. This is defined as $${\text{Percentage of positive frequency}} = \frac {\text{frequency of a positive word}} {\Sigma \text{ positive words}}$$
# Each token's positive count as a fraction of all positive token counts.
positive_total = tf['positive'].sum()
tf['pos_freq_pct'] = tf['positive'].div(positive_total)
# NOTE(review): the table is still ordered by `pos_rate`, not by the newly
# added `pos_freq_pct` column — confirm that this is intended.
tf.sort_values('pos_rate', ascending=False).iloc[:5]
Since the positive frequency percentage is just scaled over the total sum of positive words, it has the same rank as the positive rate.
So, to come up with an effective metric that reflects both the positive rate and the positive frequency percentage, we will use the harmonic mean, instead of the arithmetic mean, of the CDF (Cumulative Distribution Function) values of the positive rate and of the positive frequency percentage. The CDF can be explained as follows: "the distribution function of X, evaluated at x, is the probability that X will take a value less than or equal to x". For our purpose, the harmonic mean helps us to relate two rates or ratios expressed in different units, such as our two measures above, which differ only in their denominators. The harmonic mean H of n positive real numbers $x_1, \dots, x_n$ is defined as: $${H} = \frac {n}{\sum_{i=1}^{n}\ \frac{1}{x_i}}$$
As we can see below, the word "delicious" has a pos_rate_normcdf of 0.994893 and a pos_freq_pct_normcdf of 1. This means that roughly 99.49% of the tokens will take a pos_rate value less than or equal to 0.893093, and 100% will take a pos_freq_pct value of 0.004560 or less.
We see that pos_normcdf_hmean metric provides a more meaningful measure of how important a word is within the class! Next, we'll apply the same calculation to negative words.
from scipy.stats import norm
from scipy.stats import hmean
def normcdf(x):
    """Evaluate, at every value of *x*, the CDF of a normal fitted to *x*.

    The normal distribution uses the mean and standard deviation of *x*
    itself, as computed by ``x.mean()`` and ``x.std()``; the kind of standard
    deviation therefore follows the input type (pandas and NumPy use
    different default degrees of freedom).  Inputs without those methods,
    such as plain lists, are converted to a float NumPy array first.

    Parameters:
        x: pandas Series, NumPy array or plain sequence of numbers.

    Returns:
        The result of ``scipy.stats.norm.cdf`` for each value of *x*.
    """
    if not (hasattr(x, 'mean') and hasattr(x, 'std')):
        x = np.asarray(x, dtype=float)
    return norm.cdf(x, x.mean(), x.std())
# Positive class: normal-CDF score of the rate and of the frequency share,
# then the harmonic mean of the two scores for every token.
tf['pos_rate_normcdf'] = normcdf(tf['pos_rate'])
tf['pos_freq_pct_normcdf'] = normcdf(tf['pos_freq_pct'])
positive_scores = [tf['pos_rate_normcdf'], tf['pos_freq_pct_normcdf']]
tf['pos_normcdf_hmean'] = hmean(positive_scores)
tf.sort_values('pos_normcdf_hmean', ascending=False).head(10)

# Negative class: same construction, starting from the raw counts.
tf['neg_rate'] = tf['negative'].div(tf['total'])
tf['neg_freq_pct'] = tf['negative'].div(tf['negative'].sum())
tf['neg_rate_normcdf'] = normcdf(tf['neg_rate'])
tf['neg_freq_pct_normcdf'] = normcdf(tf['neg_freq_pct'])
negative_scores = [tf['neg_rate_normcdf'], tf['neg_freq_pct_normcdf']]
tf['neg_normcdf_hmean'] = hmean(negative_scores)
tf.sort_values('neg_normcdf_hmean', ascending=False).head(10)

# Scatter of the two combined scores, one point per token.
plt.figure(figsize=(11, 8))
ax = sns.regplot(
    data=tf,
    x="neg_normcdf_hmean",
    y="pos_normcdf_hmean",
    fit_reg=False,
    scatter_kws={'alpha': 0.5},
)
plt.ylabel('Positive Rate and Frequency CDF Harmonic Mean', size=14)
plt.xlabel('Negative Rate and Frequency CDF Harmonic Mean', size=14)
plt.title('neg_normcdf_hmean VS pos_normcdf_hmean')
plt.show()
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.resources import INLINE
from bokeh.models import LinearColorMapper
from bokeh.models import HoverTool
from bokeh.resources import INLINE
output_notebook()
# Interactive version of the scatter plot: points are coloured by their
# positive score and the hover tooltip shows the token (the frame's index).
# Note: `p` and `figure` are rebound here; earlier in the file they named the
# Phoenix subset and a plotly figure dict respectively.
positive_score = tf.pos_normcdf_hmean
color_mapper = LinearColorMapper(
    palette='Magma256',
    low=min(positive_score),
    high=max(positive_score),
)
p = figure(x_axis_label='neg_normcdf_hmean', y_axis_label='pos_normcdf_hmean')
point_colour = {'field': 'pos_normcdf_hmean', 'transform': color_mapper}
p.circle('neg_normcdf_hmean', 'pos_normcdf_hmean', size=5, alpha=0.5, source=tf, color=point_colour)
hover = HoverTool(tooltips=[('token', '@index')])
p.add_tools(hover)
show(p)